pacman::p_load(ggplot2, dplyr, party, caret, corrplot)
products2017 <- read.csv("~/Documents/@/UBIQUM/DATAML/Week6/Profitability/Profitability/DATASETS/existingproductattributes2017.csv")
products2018 <- read.csv("~/Documents/@/UBIQUM/DATAML/Week6/Profitability/Profitability/DATASETS/existingproductattributesPast.csv")


#Remove Na
# ncol(is.na(products2017))
# summary(is.na(products2017))
# na.omit(products2017)
# summary(products2017)
# complete.cases(products2017)
# str(products2017)
# products2017 <-  products2017[complete.cases(products2017),]
# str(products2017)

#Remove the column with Na
products2017$BestSellersRank <- NULL

#Find patterns in the data

#Select only numeric values Just for the PLOTS and the CORRELATION MATRIX V.1
products2017Num <- products2017[c(-1,-2)]
str(products2017Num)
## 'data.frame':    80 obs. of  15 variables:
##  $ Price                : num  949 2250 399 410 1080 ...
##  $ x5StarReviews        : int  3 2 3 49 58 83 11 33 16 10 ...
##  $ x4StarReviews        : int  3 1 0 19 31 30 3 19 9 1 ...
##  $ x3StarReviews        : int  2 0 0 8 11 10 0 12 2 1 ...
##  $ x2StarReviews        : int  0 0 0 3 7 9 0 5 0 0 ...
##  $ x1StarReviews        : int  0 0 0 9 36 40 1 9 2 0 ...
##  $ PositiveServiceReview: int  2 1 1 7 7 12 3 5 2 2 ...
##  $ NegativeServiceReview: int  0 0 0 8 20 5 0 3 1 0 ...
##  $ Recommendproduct     : num  0.9 0.9 0.9 0.8 0.7 0.3 0.9 0.7 0.8 0.9 ...
##  $ ShippingWeight       : num  25.8 50 17.4 5.7 7 1.6 7.3 12 1.8 0.75 ...
##  $ ProductDepth         : num  23.9 35 10.5 15 12.9 ...
##  $ ProductWidth         : num  6.62 31.75 8.3 9.9 0.3 ...
##  $ ProductHeight        : num  16.9 19 10.2 1.3 8.9 ...
##  $ ProfitMargin         : num  0.15 0.25 0.08 0.08 0.09 0.05 0.05 0.05 0.05 0.05 ...
##  $ Volume               : int  12 8 12 196 232 332 44 132 64 40 ...
# Visualize the data. First approach 
# for (col in 1:ncol(products2017Num)) { 
#   hist(products2017Num[,col], main = names(products2017Num[col]), xlab=colnames(products2017Num[col]), border = "black", col = "#bcd4e6", breaks=50 )}
# 
# for (col in 1:ncol(products2017Num)) { 
#   boxplot(products2017Num[,col], main = names(products2017Num[col]), xlab=colnames(products2017Num[col]), border = "black", col = "#bcd4e6" )}




#Decision Tree 1
products2017_ctree <- ctree(Volume ~ ., data=products2017, controls = ctree_control(maxdepth = 4))

plot(products2017_ctree)

#Correlation Matrix
corrData <- cor(products2017Num)
corrplot(corrData)

#Select variables. A. Without 5StarReviews
products2017CombA <- products2017[c(-2, -4)]

#Decision Tree. A
products2017_ctreeA <- ctree(Volume ~ ., data=products2017CombA, controls = ctree_control(maxdepth = 4))
plot(products2017_ctreeA)

#Positive and 4StarReviews are the most important variables. 

#FEATURE ENGINEERING

#Join the Stars Attributes because they are very correlated

products2017CombMerg <- products2017
products2017CombMerg <- products2017[c(-4:-8)]
products2017CombMerg$GoodReviews <- products2017$x4StarReviews + products2017$x3StarReviews
products2017CombMerg$BadReviews <- products2017$x2StarReviews + products2017$x1StarReviews


#Decision Tree B
products2017_ctreeB <- ctree(Volume ~ ., data=products2017CombMerg, controls = ctree_control(maxdepth = 10))
plot(products2017_ctreeB)

#Select variables:: Feature Eng. B.2 Without 5StarReviews, Without Categorical Variables
products2017CombMerg2 <- products2017CombMerg[c(-1, -2)]

#Correlation Matrix B
corrDataMerg2 <- cor(products2017CombMerg2)
corrplot(corrDataMerg2)

# Visualize the data. B
for (col in 1:ncol(products2017CombMerg2)) {
  hist(products2017CombMerg2[,col], main = names(products2017CombMerg2[col]), xlab=colnames(products2017CombMerg2[col]), border = "black", col = "#bcd4e6", breaks=200 )}

#There are some outliers in PositiveServiceReview and in the GoodReviews and BadReviews.
#So let's analyze how is their distribution

GoodReviewsNums <- products2017CombMerg %>% group_by(GoodReviews) %>% summarise (n = n()) 
GoodReviewsOutliers <- products2017CombMerg %>% filter(GoodReviews == '35')

#Products from 134 to 141 are duplicated (the only difference between them is the price)
#So let's remove them

#Select Rows:: Feature Eng. C Without duplicated rows
products2017CombMergClean <- products2017CombMerg[-c(34:41),]

#Select Rows:: Feature Eng. D All variables withour duplicated rows
products2017Clean <- products2017[-c(34:41),]

#Decision Tree C
products2017_ctreeC <- ctree(Volume ~ ., data=products2017CombMergClean, controls = ctree_control(maxdepth = 10))
plot(products2017_ctreeC)

#Creating dummy variables------
#Select variables:: Feature Eng. C.2 Without Categorical Variables
products2017CombMergClean2 <- products2017CombMergClean[c(-1, -2)]


# Visualize the data. C
for (col in 1:ncol(products2017CombMergClean2)) {
  hist(products2017CombMergClean2[,col], main = names(products2017CombMergClean2[col]), xlab=colnames(products2017CombMergClean2[col]), border = "black", col = "#bcd4e6", breaks=200 )}

ggplot(products2017CombMergClean, aes(x=ProductType, fill=Volume)) + geom_bar()

#We can see that there are still many outliers. 

#Detect outliers from the Standard Deviation


findOutlier <- function(products2017Clean, cutoff = 3) {
    ## Calculate the sd
    sdproducts2017Clean <- apply(products2017Clean, 2, sd, na.rm = TRUE)
    ## Identify the cells with value greater than cutoff * sd (column wise)
    result <- mapply(function(d, s) {
        which(d > cutoff * s)
    }, products2017Clean, sdproducts2017Clean)
    result
}

outliers <- findOutlier(products2017Clean)
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm
## = na.rm): NAs introduced by coercion
## Warning in Ops.factor(d, cutoff * s): '>' not meaningful for factors
outliers
## $ProductType
## integer(0)
## 
## $ProductNum
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
## [70] 70 71 72
## 
## $Price
## [1]  2  5 49
## 
## $x5StarReviews
## [1] 42 65
## 
## $x4StarReviews
## [1] 40 42 65
## 
## $x3StarReviews
## [1] 23 40 42 65
## 
## $x2StarReviews
## [1] 23 42
## 
## $x1StarReviews
## [1] 23
## 
## $PositiveServiceReview
## [1] 18 42
## 
## $NegativeServiceReview
## [1] 23 59
## 
## $Recommendproduct
##  [1]  1  2  3  4  5  7  8  9 10 13 14 15 16 17 18 19 20 21 22 24 25 26 28
## [24] 30 31 35 36 37 38 39 40 42 43 44 45 46 47 48 49 51 52 53 55 58 59 60
## [47] 61 63 65 66 67 68 69 71 72
## 
## $ShippingWeight
## [1]  2 27 28 60
## 
## $ProductDepth
## [1] 45
## 
## $ProductWidth
## [1]  2 26 28 40 58
## 
## $ProductHeight
## [1] 28 50
## 
## $ProfitMargin
## [1] 25 29 30 31 32 33
## 
## $Volume
## [1] 42 65
removeOutlier <- function(products2017Clean, outliers) {
    result <- mapply(function(d, o) {
        res <- d
        res[o] <- NA
        return(res)
    }, products2017Clean, outliers)
    return(as.data.frame(result))
}

#adding again the product numbers
#products2017CleanFilter$ProductNum <- products2017Clean$ProductNum 

#adding NA to the outliers
products2017CleanFilter <- removeOutlier(products2017Clean, outliers)
products2017CleanFilter$NumOutliers <- apply(products2017CleanFilter, 1, function(x) sum(is.na(x))-1)

nrow(is.na(products2017CleanFilter))
## [1] 72
hist(products2017CleanFilter$NumOutliers)

hist(products2017CleanFilter$Price)

hist(products2017Clean$Price)

products2017Clean %>% filter(Price > 1000) 
##   ProductType ProductNum   Price x5StarReviews x4StarReviews x3StarReviews
## 1          PC        102 2249.99             2             1             0
## 2      Laptop        105 1079.99            58            31            11
## 3     Display        157 1276.57           164            33             6
##   x2StarReviews x1StarReviews PositiveServiceReview NegativeServiceReview
## 1             0             0                     1                     0
## 2             7            36                     7                    20
## 3            13             6                    12                     4
##   Recommendproduct ShippingWeight ProductDepth ProductWidth ProductHeight
## 1              0.9             50         35.0        31.75          19.0
## 2              0.7              7         12.9         0.30           8.9
## 3              0.9             23         15.5        17.70           5.7
##   ProfitMargin Volume
## 1         0.25      8
## 2         0.09    232
## 3         0.25    656
#RecommendProduct results had many outliers. Why?
plot(products2017$Recommendproduct)

hist(products2017$Recommendproduct)

RecommendNums <- products2017Clean %>% group_by(Recommendproduct) %>% summarise (n = n()) 
RecommendproductOutliers <- products2017Clean %>% filter(Recommendproduct > 0.6) #Product 150, 198

#See the rows with more outliers and compare it with the table with the NA's
GeneralOutliers <- products2017Clean %>% filter(ProductNum == '198'| ProductNum == '150'| ProductNum ==  '102'|  ProductNum == '123'| ProductNum ==  '128'|  ProductNum == '148' )

#Conlusion about Outliers: 1. Products 123, 150, 198 should be removed because the have many outliers. 2. The attribute Recommendproduct has many otliers and should be removed. 3. The products with high price wont be deleted yet, first let's compare the models with and without them.


#Remove rows with outliers
 
OutlierstoRemove <- c(198,150,123)
products2017Clean2 <- products2017Clean %>% filter(!ProductNum %in% OutlierstoRemove )

products2017Clean2
##         ProductType ProductNum   Price x5StarReviews x4StarReviews
## 1                PC        101  949.00             3             3
## 2                PC        102 2249.99             2             1
## 3                PC        103  399.00             3             0
## 4            Laptop        104  409.99            49            19
## 5            Laptop        105 1079.99            58            31
## 6       Accessories        106  114.22            83            30
## 7       Accessories        107  379.99            11             3
## 8       Accessories        108   65.29            33            19
## 9       Accessories        109  119.99            16             9
## 10      Accessories        110   16.99            10             1
## 11      Accessories        111    6.55            21             2
## 12      Accessories        112   15.00            75            25
## 13      Accessories        113   52.50            10             8
## 14      Accessories        114   21.08           313            62
## 15      Accessories        115   18.98           349           118
## 16      Accessories        116    3.60             8             6
## 17      Accessories        117    3.60            11             5
## 18      Accessories        118  174.99           170           100
## 19      Accessories        119    9.99            15            12
## 20         Software        120  670.00            20             2
## 21         Software        121  133.08            34            15
## 22         Software        122  124.99           394           187
## 23         Software        124   48.50            29            18
## 24         Software        125   31.99            77            31
## 25          Display        126  179.99           306           114
## 26          Printer        127  396.35             8             0
## 27          Printer        128  262.98            22             8
## 28  PrinterSupplies        129   83.11             0             0
## 29  PrinterSupplies        130   26.78             6             2
## 30  PrinterSupplies        131   43.22             5             0
## 31 ExtendedWarranty        132   79.98             0             1
## 32 ExtendedWarranty        133   87.98             5             0
## 33               PC        142  609.99            21             7
## 34           Laptop        143  770.60            22            14
## 35      Accessories        144    3.84            92            29
## 36      Accessories        145    6.99           384            31
## 37      Accessories        146   59.99           366            59
## 38      Accessories        147   38.66           209            83
## 39      Accessories        148   10.75           535           261
## 40      Accessories        149  126.95            13             2
## 41      Accessories        151  103.85             3             0
## 42      Accessories        152   52.72            51            33
## 43      Accessories        153   19.99           474            38
## 44      Accessories        154   16.24           108            54
## 45      Accessories        155   69.00           226            37
## 46          Display        156  359.99            90            27
## 47          Display        157 1276.57           164            33
## 48          Display        158  783.98            26            13
## 49          Display        159  149.99            21            10
## 50          Printer        160  129.99            74            25
## 51          Printer        161  128.49            58            33
## 52          Printer        162  141.95             4             0
## 53          Printer        163  149.99             8             3
## 54          Printer        164  165.99             2             0
## 55          Printer        165  169.26            20            13
## 56          Printer        166  132.36             0             1
## 57          Printer        167  149.99           206            89
## 58          Printer        168  395.00             8             0
## 59          Printer        169  385.96            99            43
## 60          Netbook        177  379.99             1             0
## 61           Tablet        185  499.00           148            66
## 62       Smartphone        190  199.00             4             1
## 63           Tablet        188  499.00            86            51
## 64           Tablet        189  419.00             3             1
## 65       Smartphone        191  200.00            62            25
## 66       Smartphone        192   99.00            18            17
## 67          Netbook        182  349.99            22            10
## 68       Smartphone        197  499.00           368            28
## 69      GameConsole        200  299.99           421            87
##    x3StarReviews x2StarReviews x1StarReviews PositiveServiceReview
## 1              2             0             0                     2
## 2              0             0             0                     1
## 3              0             0             0                     1
## 4              8             3             9                     7
## 5             11             7            36                     7
## 6             10             9            40                    12
## 7              0             0             1                     3
## 8             12             5             9                     5
## 9              2             0             2                     2
## 10             1             0             0                     2
## 11             2             4            15                     2
## 12             6             3             3                     9
## 13             5             0             1                     2
## 14            13             8            16                    44
## 15            27             7             5                    57
## 16             3             2             1                     0
## 17             2             2             1                     0
## 18            23            20            20                   310
## 19             4             0             4                     3
## 20             4             2             4                     4
## 21             2             2            10                     5
## 22            63            42            86                    55
## 23             3             1             8                     4
## 24             7             3             2                     7
## 25            25            22            28                    42
## 26             1             0             2                     1
## 27             3             1             3                     5
## 28             0             1             3                     1
## 29             0             0             1                     1
## 30             0             0             0                     1
## 31             1             1             1                     0
## 32             2             0             1                     0
## 33             3             0            12                     5
## 34             4             5             6                     6
## 35             8             4            10                    12
## 36            13             5            20                    50
## 37            23            26            24                    60
## 38            35            38            36                    15
## 39           134           104           177                   120
## 40             2             2             5                     5
## 41             0             0             0                     1
## 42             8             7             4                     6
## 43             7             6            12                    80
## 44            12             2             6                     7
## 45             3             7             8                    13
## 46            10             4             4                     7
## 47             6            13             6                    12
## 48             7             5            16                     4
## 49             3             1             4                     4
## 50             7             6             9                     4
## 51            10             3             6                     5
## 52             0             0             3                     0
## 53             3             2             0                     0
## 54             1             1             2                     1
## 55             8             6            21                     4
## 56             0             0             0                     0
## 57            20            22            65                    42
## 58             1             0             2                     3
## 59            17            11            20                     8
## 60             1             1             0                     0
## 61            30            20            29                    12
## 62             0             2             2                     1
## 63            17            12             9                    14
## 64             0             0             0                     0
## 65            10            11            12                     9
## 66             6             2            12                     5
## 67             6             2            10                     3
## 68            14            10            23                    22
## 69            20            14            39                    29
##    NegativeServiceReview Recommendproduct ShippingWeight ProductDepth
## 1                      0              0.9          25.80        23.94
## 2                      0              0.9          50.00        35.00
## 3                      0              0.9          17.40        10.50
## 4                      8              0.8           5.70        15.00
## 5                     20              0.7           7.00        12.90
## 6                      5              0.3           1.60         5.80
## 7                      0              0.9           7.30         6.70
## 8                      3              0.7          12.00         7.90
## 9                      1              0.8           1.80        10.60
## 10                     0              0.9           0.75        10.70
## 11                     1              0.5           1.00         7.30
## 12                     2              0.2           2.20        21.30
## 13                     0              0.8           1.10        15.60
## 14                     3              0.9           0.35         5.70
## 15                     3              0.9           0.60         1.70
## 16                     0              0.8           0.01        11.50
## 17                     0              0.8           0.01        11.50
## 18                     6              0.8           1.40        13.80
## 19                     1              0.8           0.40        11.10
## 20                     3              0.7           0.25         5.80
## 21                     4              0.7           3.20         7.40
## 22                    38              0.8           0.15         7.60
## 23                     2              0.8           0.20         8.00
## 24                     0              0.9           0.20         0.00
## 25                    12              0.8          13.70         8.50
## 26                     1              0.3          63.00        17.90
## 27                     1              0.8          57.00        17.30
## 28                     1              0.1          10.30         0.00
## 29                     0              0.9           1.00         3.30
## 30                     0              1.0           1.00         4.70
## 31                     3              0.4           0.20         0.00
## 32                     1              0.6           0.20         0.00
## 33                     3              0.6          29.10        20.95
## 34                     2              0.7           3.54        12.72
## 35                     3              0.8           1.00         6.50
## 36                     1              0.9           1.00       108.00
## 37                     5              0.8           2.40         7.90
## 38                     4              0.7           0.40         3.60
## 39                    15              0.7           1.50        11.50
## 40                     1              0.6           3.00         5.90
## 41                     0              0.9           8.00         4.80
## 42                     1              0.8           4.00         6.60
## 43                     2              0.9           1.50       300.00
## 44                     3              0.9           0.55         6.00
## 45                     1              0.9           3.80        15.00
## 46                     3              0.9           7.00         9.20
## 47                     4              0.9          23.00        15.50
## 48                     5              0.6          25.00        29.20
## 49                     2              0.8          10.00        20.00
## 50                     2              0.9          32.20        15.70
## 51                     2              0.9          22.70        15.70
## 52                     1              0.5          25.00        19.50
## 53                     0              0.7          35.00        10.20
## 54                     1              0.5          31.00        22.10
## 55                     7              0.5          32.00        15.10
## 56                     0              0.8          30.20        20.90
## 57                    50              0.7          13.00         8.80
## 58                     0              0.8          63.00        17.90
## 59                    13              0.7          39.00        21.00
## 60                     1              0.3           3.00         7.44
## 61                     6              0.8           2.20         7.10
## 62                     1              0.5           1.10         4.50
## 63                     2              0.8           2.00        10.10
## 64                     0              0.9           2.20         7.00
## 65                     3              0.8           0.90         2.80
## 66                     4              0.7           0.70         2.80
## 67                     3              0.3           5.00         7.57
## 68                     3              0.9           0.90         2.70
## 69                    14              0.9          10.94        12.00
##    ProductWidth ProductHeight ProfitMargin Volume
## 1          6.62         16.89         0.15     12
## 2         31.75         19.00         0.25      8
## 3          8.30         10.20         0.08     12
## 4          9.90          1.30         0.08    196
## 5          0.30          8.90         0.09    232
## 6          4.00          1.00         0.05    332
## 7         10.30         11.50         0.05     44
## 8          6.70          2.20         0.05    132
## 9          9.40          4.70         0.05     64
## 10        13.10          0.60         0.05     40
## 11         7.00          1.60         0.05     84
## 12         1.80          7.80         0.05    300
## 13         3.00         15.00         0.05     40
## 14         3.50          8.30         0.05   1252
## 15        13.50         10.20         0.05   1396
## 16         8.50          0.40         0.05     32
## 17         8.50          0.40         0.05     44
## 18         8.20          0.40         0.05    680
## 19         7.60          0.50         0.05     60
## 20         1.40          7.80         0.15     80
## 21         5.50          1.40         0.15    136
## 22         5.50          1.20         0.20   1576
## 23         7.00          1.00         0.20    116
## 24         0.00          0.00         0.30    308
## 25        22.30         17.50         0.08   1224
## 26        15.90         12.70         0.12     32
## 27        23.50         25.80         0.12     88
## 28         0.00          0.00         0.35      0
## 29         1.60          4.70         0.30     24
## 30         2.90          6.30         0.30     20
## 31         0.00          0.00         0.40      0
## 32         0.00          0.00         0.40     20
## 33         8.47         20.71         0.09     84
## 34         8.90          0.71         0.15     88
## 35         4.20          1.50         0.05    368
## 36         0.50          0.50         0.05   1536
## 37         1.40         19.10         0.05   1464
## 38         6.50          8.80         0.05    836
## 39        20.00          7.20         0.05   2140
## 40         1.80          6.70         0.05     52
## 41         1.30          4.80         0.05     12
## 42         5.50          5.20         0.05    204
## 43         0.50          0.50         0.05   1896
## 44         3.00          8.20         0.05    432
## 45         9.80         18.90         0.05    904
## 46        11.20          3.20         0.20    360
## 47        17.70          5.70         0.25    656
## 48         9.90         23.00         0.16    104
## 49        15.30          6.40         0.17     84
## 50        15.90         12.40         0.10    296
## 51        15.90         10.60         0.12    232
## 52        18.00         14.00         0.14     16
## 53        15.98         14.55         0.18     32
## 54        18.60         13.50         0.18      8
## 55        11.70         19.60         0.16     80
## 56        20.90         14.60         0.20      0
## 57        13.70          7.60         0.15    824
## 58        15.90         12.70         0.09     32
## 59        15.40         17.90         0.11    396
## 60        10.43          1.02         0.10      4
## 61        10.40          0.30         0.13    592
## 62         2.50          0.50         0.10     16
## 63         6.90          0.38         0.20    344
## 64        10.20          0.40         0.18     12
## 65         5.40          0.30         0.14    248
## 66         5.30          0.40         0.17     72
## 67        10.47          1.43         0.12     88
## 68         5.00          0.40         0.10   1472
## 69        11.50          7.25         0.12   1684
#RecommendProduct














# box_plot <- boxplot(products2017[, c("","")])
#sort(products2017Num$NegativeServiceReview)